\relax 
\providecommand\hyper@newdestlabel[2]{}
\providecommand*\HyPL@Entry[1]{}
\citation{peters-etal:2018:_deep,radford-etal:2018}
\citation{dai-le:2015:_semi,peters-etal:2018:_deep,radford-etal:2018,howard-ruder:2018}
\citation{bowman-etal:2015,williams-nangia-bowman:2018}
\citation{dolan-brockett:2005:_autom}
\citation{tjong-de:2003,rajpurkar-etal:2016:_squad}
\citation{peters-etal:2018:_deep}
\citation{radford-etal:2018}
\citation{vaswani-etal:2017:_atten}
\HyPL@Entry{0<</S/D>>}
\citation{taylor:1953:_cloze}
\citation{radford-etal:2018}
\citation{peters-etal:2018:_deep}
\citation{brown-etal:1992:_class,ando-zhang:2005,blitzer-mcdonald-pereira:2006:_domain}
\citation{mikolov-etal:2013,pennington-socher-manning:2014:_glove}
\citation{turian-ratinov-bengio:2010:_word_repres}
\citation{minh09}
\citation{mikolov-etal:2013}
\citation{kiros-etal:2015:_skip,logeswaran2018an}
\citation{le-mikolov:2014:_distr}
\citation{DBLP:journals/corr/JerniteBS17,logeswaran2018an}
\citation{kiros-etal:2015:_skip}
\citation{hill16}
\citation{peters-etal:2017:_semi,peters-etal:2018:_deep}
\citation{peters-etal:2018:_deep}
\citation{rajpurkar-etal:2016:_squad}
\citation{socher-etal:2013:_recur}
\citation{tjong-de:2003}
\citation{melamud2016context2vec}
\citation{fedus2018maskgan}
\citation{collobert-weston:2008}
\citation{dai-le:2015:_semi,howard-ruder:2018,radford-etal:2018}
\citation{radford-etal:2018}
\citation{wang-etal:2018:_glue}
\citation{howard-ruder:2018,radford-etal:2018,dai-le:2015:_semi}
\citation{conneau-EtAl:2017:EMNLP2017}
\citation{mccann-etal:2017:_learn_trans}
\citation{imagenet_cvpr09,yosinski2014transferable}
\newlabel{sec:bert}{{3}{3}{BERT}{section.3}{}}
\citation{vaswani-etal:2017:_atten}
\citation{vaswani-etal:2017:_atten}
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
\newlabel{fig:bert_overall}{{1}{4}{Overall pre-training and fine-tuning procedures for BERT. Apart from output layers, the same architectures are used in both pre-training and fine-tuning. The same pre-trained model parameters are used to initialize models for different downstream tasks. During fine-tuning, all parameters are fine-tuned. {\tt [CLS]} is a special symbol added in front of every input example, and {\tt [SEP]} is a special separator token (e.g., separating questions/answers)}{figure.caption.1}{}}
\citation{wu-etal:2016:_googl}
\citation{peters-etal:2018:_deep}
\citation{radford-etal:2018}
\citation{taylor:1953:_cloze}
\citation{vincent:2008}
\newlabel{sec:pretraining_tasks}{{3.1}{5}{Pre-training BERT}{subsection.3.1}{}}
\citation{DBLP:journals/corr/JerniteBS17}
\citation{logeswaran2018an}
\citation{zhu:2015}
\citation{chelba-etal:2013:_one}
\citation{parikh-etal:2016,bidaf}
\newlabel{fig:input_embeddings}{{2}{6}{BERT input representation. The input embeddings are the sum of the token embeddings, the segmentation embeddings and the position embeddings}{figure.caption.4}{}}
\citation{wang-etal:2018:_glue}
\newlabel{sec:finetuning_procedure}{{3.2}{7}{Fine-tuning BERT}{subsection.3.2}{}}
\newlabel{sec:experiments}{{4}{7}{Experiments}{section.4}{}}
\newlabel{sec:glue}{{4.1}{7}{GLUE}{subsection.4.1}{}}
\citation{rajpurkar-etal:2016:_squad}
\citation{bidaf,clark-gardner:2018:_simpl,peters-etal:2018:_deep,hu2017reinforced}
\citation{yu-etal:2018:_qanet}
\citation{joshi-etal:2017:_triviaq}
\newlabel{tab:glue_official}{{1}{8}{GLUE Test results, scored by the evaluation server ({\small \url {https://gluebenchmark.com/leaderboard}}). The number below each task denotes the number of training examples. The ``Average'' column is slightly different from the official GLUE score, since we exclude the problematic WNLI set.\footnote {See question 10 in \url {https://gluebenchmark.com/faq}.} BERT and OpenAI GPT are single-model, single-task. F1 scores are reported for QQP and MRPC, Spearman correlations are reported for STS-B, and accuracy scores are reported for the other tasks. We exclude entries that use BERT as one of their components}{table.caption.7}{}}
\newlabel{sec:squad}{{4.2}{8}{SQuAD v1.1}{subsection.4.2}{}}
\citation{unet,slqa}
\newlabel{tab:squad_results}{{2}{9}{SQuAD 1.1 results. The BERT ensemble combines 7 systems that use different pre-training checkpoints and fine-tuning seeds}{table.caption.8}{}}
\newlabel{tab:squad2_results}{{3}{9}{SQuAD 2.0 results. We exclude entries that use BERT as one of their components}{table.caption.9}{}}
\citation{zellers2018swag}
\newlabel{tab:swag_official}{{4}{10}{SWAG Dev and Test accuracies. $^\dagger $Human performance is measured with 100 samples, as reported in the SWAG paper}{table.caption.10}{}}
\newlabel{sec:swag}{{4.4}{10}{SWAG}{subsection.4.4}{}}
\newlabel{sec:ablation}{{5}{10}{Ablation Studies}{section.5}{}}
\newlabel{sec:task_ablation}{{5.1}{10}{Effect of Pre-training Tasks}{subsection.5.1}{}}
\citation{vaswani-etal:2017:_atten}
\citation{alrfou:2018}
\citation{peters2018dissecting}
\citation{melamud2016context2vec}
\newlabel{tab:task_ablation}{{5}{11}{Ablation over the pre-training tasks using the \bertbase architecture. ``No NSP'' is trained without the next sentence prediction task. ``LTR \& No NSP'' is trained as a left-to-right LM without the next sentence prediction task, like OpenAI GPT. ``+ BiLSTM'' adds a randomly initialized BiLSTM on top of the ``LTR \& No NSP'' model during fine-tuning}{table.caption.11}{}}
\newlabel{sec:model_size_ablation}{{5.2}{11}{Effect of Model Size}{subsection.5.2}{}}
\newlabel{tab:size_ablation}{{6}{11}{Ablation over BERT model size. \#L = the number of layers; \#H = hidden size; \#A = number of attention heads. ``LM (ppl)'' is the masked LM perplexity of held-out training data}{table.caption.12}{}}
\citation{tjong-de:2003}
\citation{peters-etal:2018:_deep}
\citation{clark2018semi}
\citation{akbik2018contextual}
\newlabel{sec:ner}{{5.3}{12}{Feature-based Approach with BERT}{subsection.5.3}{}}
\bibstyle{acl_natbib}
\bibdata{lumiere}
\bibcite{akbik2018contextual}{{1}{2018}{{Akbik et~al.}}{{Akbik, Blythe, and Vollgraf}}}
\bibcite{alrfou:2018}{{2}{2018}{{Al-Rfou et~al.}}{{Al-Rfou, Choe, Constant, Guo, and Jones}}}
\bibcite{ando-zhang:2005}{{3}{2005}{{Ando and Zhang}}{{}}}
\bibcite{bentivogli-etal:2009}{{4}{2009}{{Bentivogli et~al.}}{{Bentivogli, Magnini, Dagan, Dang, and Giampiccolo}}}
\bibcite{blitzer-mcdonald-pereira:2006:_domain}{{5}{2006}{{Blitzer et~al.}}{{Blitzer, McDonald, and Pereira}}}
\bibcite{bowman-etal:2015}{{6}{2015}{{Bowman et~al.}}{{Bowman, Angeli, Potts, and Manning}}}
\bibcite{brown-etal:1992:_class}{{7}{1992}{{Brown et~al.}}{{Brown, Desouza, Mercer, Pietra, and Lai}}}
\bibcite{cer-etal:2017}{{8}{2017}{{Cer et~al.}}{{Cer, Diab, Agirre, Lopez-Gazpio, and Specia}}}
\bibcite{chelba-etal:2013:_one}{{9}{2013}{{Chelba et~al.}}{{Chelba, Mikolov, Schuster, Ge, Brants, Koehn, and Robinson}}}
\bibcite{chen-etal:2018:_quora}{{10}{2018}{{Chen et~al.}}{{Chen, Zhang, Zhang, and Zhao}}}
\bibcite{clark-gardner:2018:_simpl}{{11}{2018}{{Clark and Gardner}}{{}}}
\bibcite{clark2018semi}{{12}{2018}{{Clark et~al.}}{{Clark, Luong, Manning, and Le}}}
\bibcite{collobert-weston:2008}{{13}{2008}{{Collobert and Weston}}{{}}}
\bibcite{conneau-EtAl:2017:EMNLP2017}{{14}{2017}{{Conneau et~al.}}{{Conneau, Kiela, Schwenk, Barrault, and Bordes}}}
\bibcite{dai-le:2015:_semi}{{15}{2015}{{Dai and Le}}{{}}}
\newlabel{tab:ner_results}{{7}{13}{CoNLL-2003 Named Entity Recognition results. Hyperparameters were selected using the Dev set. The reported Dev and Test scores are averaged over 5 random restarts using those hyperparameters}{table.caption.13}{}}
\newlabel{tab:pretrained_embeddings}{{7}{13}{CoNLL-2003 Named Entity Recognition results. Hyperparameters were selected using the Dev set. The reported Dev and Test scores are averaged over 5 random restarts using those hyperparameters}{table.caption.13}{}}
\bibcite{imagenet_cvpr09}{{16}{2009}{{Deng et~al.}}{{Deng, Dong, Socher, Li, Li, and Fei-Fei}}}
\bibcite{dolan-brockett:2005:_autom}{{17}{2005}{{Dolan and Brockett}}{{}}}
\bibcite{fedus2018maskgan}{{18}{2018}{{Fedus et~al.}}{{Fedus, Goodfellow, and Dai}}}
\bibcite{hendrycks:2016}{{19}{2016}{{Hendrycks and Gimpel}}{{}}}
\bibcite{hill16}{{20}{2016}{{Hill et~al.}}{{Hill, Cho, and Korhonen}}}
\bibcite{howard-ruder:2018}{{21}{2018}{{Howard and Ruder}}{{}}}
\bibcite{hu2017reinforced}{{22}{2018}{{Hu et~al.}}{{Hu, Peng, Huang, Qiu, Wei, and Zhou}}}
\bibcite{DBLP:journals/corr/JerniteBS17}{{23}{2017}{{Jernite et~al.}}{{Jernite, Bowman, and Sontag}}}
\bibcite{joshi-etal:2017:_triviaq}{{24}{2017}{{Joshi et~al.}}{{Joshi, Choi, Weld, and Zettlemoyer}}}
\bibcite{kiros-etal:2015:_skip}{{25}{2015}{{Kiros et~al.}}{{Kiros, Zhu, Salakhutdinov, Zemel, Urtasun, Torralba, and Fidler}}}
\bibcite{le-mikolov:2014:_distr}{{26}{2014}{{Le and Mikolov}}{{}}}
\bibcite{levesque-davis-morgenstern:2011:_winog}{{27}{2011}{{Levesque et~al.}}{{Levesque, Davis, and Morgenstern}}}
\bibcite{logeswaran2018an}{{28}{2018}{{Logeswaran and Lee}}{{}}}
\bibcite{mccann-etal:2017:_learn_trans}{{29}{2017}{{McCann et~al.}}{{McCann, Bradbury, Xiong, and Socher}}}
\bibcite{melamud2016context2vec}{{30}{2016}{{Melamud et~al.}}{{Melamud, Goldberger, and Dagan}}}
\bibcite{mikolov-etal:2013}{{31}{2013}{{Mikolov et~al.}}{{Mikolov, Sutskever, Chen, Corrado, and Dean}}}
\bibcite{minh09}{{32}{2009}{{Mnih and Hinton}}{{}}}
\bibcite{parikh-etal:2016}{{33}{2016}{{Parikh et~al.}}{{Parikh, T{\"a}ckstr{\"o}m, Das, and Uszkoreit}}}
\bibcite{pennington-socher-manning:2014:_glove}{{34}{2014}{{Pennington et~al.}}{{Pennington, Socher, and Manning}}}
\bibcite{peters-etal:2017:_semi}{{35}{2017}{{Peters et~al.}}{{Peters, Ammar, Bhagavatula, and Power}}}
\bibcite{peters-etal:2018:_deep}{{36}{2018{a}}{{Peters et~al.}}{{Peters, Neumann, Iyyer, Gardner, Clark, Lee, and Zettlemoyer}}}
\bibcite{peters2018dissecting}{{37}{2018{b}}{{Peters et~al.}}{{Peters, Neumann, Zettlemoyer, and Yih}}}
\bibcite{radford-etal:2018}{{38}{2018}{{Radford et~al.}}{{Radford, Narasimhan, Salimans, and Sutskever}}}
\bibcite{rajpurkar-etal:2016:_squad}{{39}{2016}{{Rajpurkar et~al.}}{{Rajpurkar, Zhang, Lopyrev, and Liang}}}
\bibcite{bidaf}{{40}{2017}{{Seo et~al.}}{{Seo, Kembhavi, Farhadi, and Hajishirzi}}}
\bibcite{socher-etal:2013:_recur}{{41}{2013}{{Socher et~al.}}{{Socher, Perelygin, Wu, Chuang, Manning, Ng, and Potts}}}
\bibcite{unet}{{42}{2018}{{Sun et~al.}}{{Sun, Li, Qiu, and Liu}}}
\bibcite{taylor:1953:_cloze}{{43}{1953}{{Taylor}}{{}}}
\bibcite{tjong-de:2003}{{44}{2003}{{Tjong Kim~Sang and De~Meulder}}{{}}}
\bibcite{turian-ratinov-bengio:2010:_word_repres}{{45}{2010}{{Turian et~al.}}{{Turian, Ratinov, and Bengio}}}
\bibcite{vaswani-etal:2017:_atten}{{46}{2017}{{Vaswani et~al.}}{{Vaswani, Shazeer, Parmar, Uszkoreit, Jones, Gomez, Kaiser, and Polosukhin}}}
\bibcite{vincent:2008}{{47}{2008}{{Vincent et~al.}}{{Vincent, Larochelle, Bengio, and Manzagol}}}
\bibcite{wang-etal:2018:_glue}{{48}{2018{a}}{{Wang et~al.}}{{Wang, Singh, Michael, Hill, Levy, and Bowman}}}
\bibcite{slqa}{{49}{2018{b}}{{Wang et~al.}}{{Wang, Yan, and Wu}}}
\bibcite{warstadt-singh-bowman:2018:_corpus}{{50}{2018}{{Warstadt et~al.}}{{Warstadt, Singh, and Bowman}}}
\bibcite{williams-nangia-bowman:2018}{{51}{2018}{{Williams et~al.}}{{Williams, Nangia, and Bowman}}}
\bibcite{wu-etal:2016:_googl}{{52}{2016}{{Wu et~al.}}{{Wu, Schuster, Chen, Le, Norouzi, Macherey, Krikun, Cao, Gao, Macherey et~al.}}}
\bibcite{yosinski2014transferable}{{53}{2014}{{Yosinski et~al.}}{{Yosinski, Clune, Bengio, and Lipson}}}
\bibcite{yu-etal:2018:_qanet}{{54}{2018}{{Yu et~al.}}{{Yu, Dohan, Luong, Zhao, Chen, Norouzi, and Le}}}
\bibcite{zellers2018swag}{{55}{2018}{{Zellers et~al.}}{{Zellers, Bisk, Schwartz, and Choi}}}
\bibcite{zhu:2015}{{56}{2015}{{Zhu et~al.}}{{Zhu, Kiros, Zemel, Salakhutdinov, Urtasun, Torralba, and Fidler}}}
\HyPL@Entry{14<</S/D>>}
\newlabel{appendix:sec:bert_description}{{A}{2}{Additional Details for BERT}{appendix.A}{}}
\newlabel{sec:pretraining_procedure}{{A.2}{2}{Pre-training Procedure}{subsection.A.2}{}}
\citation{hendrycks:2016}
\newlabel{fig:BERT_comparisons}{{3}{3}{Differences in pre-training model architectures. BERT uses a bidirectional Transformer. OpenAI GPT uses a left-to-right Transformer. ELMo uses the concatenation of independently trained left-to-right and right-to-left LSTMs to generate features for downstream tasks. Among the three, only \bert representations are jointly conditioned on both left and right context in all layers. In addition to the architecture differences, BERT and OpenAI GPT are fine-tuning approaches, while ELMo is a feature-based approach}{figure.caption.15}{}}
\citation{wang-etal:2018:_glue}
\newlabel{appendix:sec:comparing_bert_and_openai}{{A.4}{4}{Comparison of BERT, ELMo, and OpenAI GPT}{subsection.A.4}{}}
\newlabel{appendix:sec:fine_tune_details_and_figures}{{A.5}{4}{Illustrations of Fine-tuning on Different Tasks}{subsection.A.5}{}}
\citation{williams-nangia-bowman:2018}
\citation{chen-etal:2018:_quora}
\citation{rajpurkar-etal:2016:_squad}
\citation{wang-etal:2018:_glue}
\newlabel{fig:bert_fine_tune}{{4}{5}{Illustrations of Fine-tuning BERT on Different Tasks}{figure.caption.18}{}}
\newlabel{appendix:sec:exp_details}{{B}{5}{Detailed Experimental Setup}{appendix.B}{}}
\newlabel{appendix:sec:glue}{{B.1}{5}{Detailed Descriptions for the GLUE Benchmark Experiments}{subsection.B.1}{}}
\citation{socher-etal:2013:_recur}
\citation{warstadt-singh-bowman:2018:_corpus}
\citation{cer-etal:2017}
\citation{dolan-brockett:2005:_autom}
\citation{bentivogli-etal:2009}
\citation{levesque-davis-morgenstern:2011:_winog}
\newlabel{appendix:sec:more_ablation_studies}{{C}{6}{Additional Ablation Studies}{appendix.C}{}}
\newlabel{sec:num_training_steps}{{C.1}{6}{Effect of Number of Training Steps}{subsection.C.1}{}}
\newlabel{appendix:sec:different_masks}{{C.2}{6}{Ablation for Different Masking Procedures}{subsection.C.2}{}}
\newlabel{tab:mask_ablation}{{8}{7}{Ablation over different masking strategies}{table.caption.29}{}}
\newlabel{fig:step_abalation}{{5}{8}{Ablation over number of training steps. This shows the MNLI accuracy after fine-tuning, starting from model parameters that have been pre-trained for $k$ steps. The x-axis is the value of $k$}{figure.caption.28}{}}
\gdef \@abspage@last{22}
